#install.packages("glmnet")
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-6

Regularization

Let’s recreate the simulations from lecture. We assume the predictors are normally distributed (with differing means and standard deviations) and the response is assumed \(Y=20 + 3X_1 - 2X_2 + \epsilon\) where \(\epsilon\) is normal \((\mu=0, \sigma^2=4)\)

# Simulate n = 30 observations from the true model
#   Y = 20 + 3*X1 - 2*X2 + eps,  eps ~ N(0, sd = 2)  (i.e. sigma^2 = 4)
set.seed(35521)
x1 <- rnorm(30,0,3)
x2 <- rnorm(30,1,4)
y <- 20 + 3*x1 - 2*x2 + rnorm(length(x1), sd=2)
# Quick marginal scatterplots of y against each predictor
plot(y~x1+x2)

# glmnet takes the predictors as a matrix, not a formula/data.frame
x <- cbind(x1, x2)

Now we fit a standard linear model to see how close we are to estimating the true model…

# Baseline: ordinary least squares, for comparison with the penalized
# fits below
linmod <- lm(y~x1+x2)
summary(linmod)
## 
## Call:
## lm(formula = y ~ x1 + x2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7524 -0.9086 -0.1260  1.7005  3.0130 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  19.4425     0.3636   53.47   <2e-16 ***
## x1            3.0808     0.1335   23.07   <2e-16 ***
## x2           -2.1267     0.1097  -19.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.941 on 27 degrees of freedom
## Multiple R-squared:  0.9734, Adjusted R-squared:  0.9714 
## F-statistic: 494.4 on 2 and 27 DF,  p-value: < 2.2e-16

Now ridge regression…

# Ridge regression (alpha = 0) with cross-validation over glmnet's
# default lambda sequence.
# NOTE(review): no set.seed() immediately before this call, so the CV
# fold assignment — and hence lambda.min below — is not reproducible on
# rerun; the later chunks do seed before cv.glmnet.
rrsim_d <- cv.glmnet(x, y, alpha=0)
# Coefficient paths as a function of log(lambda)
plot(rrsim_d$glmnet.fit, label=TRUE, xvar="lambda")

# CV error curve (with lambda.min / lambda.1se markers)
plot(rrsim_d)

# lambda attaining the minimum cross-validated error
rrsim_d$lambda.min
## [1] 0.8770535

Since the minimum was determined nearly at the lower boundary of \(\lambda\), it’s worthwhile to try a different grid of \(\lambda\) values than what the default provides…

# Custom lambda grid: 100 values equally spaced on the log scale from
# e^10 down to e^-6, extending below glmnet's default lower boundary.
# Spell out `length.out` instead of relying on R's partial argument
# matching via `length=`.
grid <- exp(seq(10, -6, length.out = 100))
# Seed so the CV fold assignment is reproducible
set.seed(451642)
# Re-run ridge CV over the custom lambda grid
rrsim <- cv.glmnet(x, y, alpha=0, lambda=grid)
plot(rrsim$glmnet.fit, label=TRUE, xvar="lambda")

plot(rrsim)

rrsim$lambda.min
## [1] 0.1198862

Sure enough, the minimal prediction error arises from a smaller \(\lambda\) than our original grid tested. We can now look at the actual model fits for the suggested values of lambda…

# Refit ridge at the two CV-suggested penalties:
#   lambda.min — minimizes the CV-estimated error
#   lambda.1se — largest lambda within 1 SE of that minimum
lammin <- rrsim$lambda.min
lam1se <- rrsim$lambda.1se
rrsimmin <- glmnet(x,y,alpha = 0, lambda=lammin)
rrsim1se <- glmnet(x,y,alpha = 0, lambda=lam1se)
coef(rrsimmin)
## 3 x 1 sparse Matrix of class "dgCMatrix"
##                    s0
## (Intercept) 19.458267
## x1           3.050425
## x2          -2.106344
coef(rrsim1se)
## 3 x 1 sparse Matrix of class "dgCMatrix"
##                    s0
## (Intercept) 19.581891
## x1           2.811993
## x2          -1.946284

Let’s throw those results into a table for easy comparison…

# Side-by-side comparison: true coefficients vs OLS vs the two ridge fits
coeftab <- cbind(c(20.00,3.00,-2.00), coef(linmod), coef(rrsimmin), coef(rrsim1se))
colnames(coeftab) <- c("TRUE", "LM", "RidgeRegMin", "RidgeReg1se")
round(coeftab,2)
## 3 x 4 sparse Matrix of class "dgCMatrix"
##             TRUE    LM RidgeRegMin RidgeReg1se
## (Intercept)   20 19.44       19.46       19.58
## x1             3  3.08        3.05        2.81
## x2            -2 -2.13       -2.11       -1.95

So ridge regression with the \(\lambda\) with estimated minimum MSE from CV provides estimates closer to the true values than the standard linear model approach as well as the ridge regression with \(\lambda\) within 1 standard error of the minimum. We can also run lasso using the glmnet command, the only difference is setting \(\alpha=1\) instead of 0…

# Lasso: identical workflow, but alpha = 1. Reuses the custom lambda
# grid from the ridge fit.
# NOTE(review): no set.seed() before this cv.glmnet() call, so the CV
# fold assignment is not reproducible on rerun.
lasim <- cv.glmnet(x, y, alpha=1, lambda=grid)
plot(lasim$glmnet.fit, label=TRUE, xvar="lambda")

plot(lasim)

# Refit lasso at the CV-suggested penalties (note: these overwrite the
# ridge lammin/lam1se values from above)
lammin <- lasim$lambda.min
lam1se <- lasim$lambda.1se
lasimmin <- glmnet(x,y,alpha = 1, lambda=lammin)
lasim1se <- glmnet(x,y,alpha = 1, lambda=lam1se)
coef(lasimmin)
## 3 x 1 sparse Matrix of class "dgCMatrix"
##                    s0
## (Intercept) 19.457600
## x1           3.050666
## x2          -2.101920
coef(lasim1se)
## 3 x 1 sparse Matrix of class "dgCMatrix"
##                    s0
## (Intercept) 19.565775
## x1           2.834640
## x2          -1.924461
# Compare truth, OLS, ridge (lambda.min), and lasso (lambda.min)
coeftab <- cbind(c(20.00,3.00,-2.00), coef(linmod), coef(rrsimmin), coef(lasimmin))
colnames(coeftab) <- c("TRUE", "LM", "RidgeRegMin", "LassoMin")
round(coeftab,2)
## 3 x 4 sparse Matrix of class "dgCMatrix"
##             TRUE    LM RidgeRegMin LassoMin
## (Intercept)   20 19.44       19.46    19.46
## x1             3  3.08        3.05     3.05
## x2            -2 -2.13       -2.11    -2.10

Results are essentially the same between ridge regression and lasso on this simulation. This is because to approach the true model, we only need to very slightly shrink the original estimates. Furthermore, there are no useless predictors to remove in this case. Let’s change that…

Useless Predictors

Let’s set up a model similar to one you would have seen in 570. A bunch of independent and useless predictors!

# 100 observations of 500 pure-noise predictors, plus a response that is
# independent of all of them — no predictor carries any signal.
set.seed(52141)
bmat <- matrix(rnorm(50000), nrow=100)
dim(bmat)
## [1] 100 500
# NOTE(review): this overwrites the y simulated in the earlier chunk
y <- rnorm(100)
# Lasso CV over the default lambda sequence
bsimcv <- cv.glmnet(bmat, y, alpha=1)
plot(bsimcv)

plot(bsimcv$glmnet.fit, label=TRUE, xvar="lambda")

In this case, the lambda within 1 standard error appears to be at the boundary. So let’s again manually specify a grid of lambda to see what happens…

# Extend the lambda grid upward past the previous boundary: 100 values
# log-spaced from e^-6 to e^-1. Spell out `length.out` instead of
# relying on R's partial argument matching via `length=`.
grid <- exp(seq(-6, -1, length.out = 100))
# Re-run lasso CV over the wider grid
bsimcv2 <- cv.glmnet(bmat, y, alpha=1, lambda=grid)
plot(bsimcv2)

Interesting! The model that removes all predictors (and thereby only models according to the intercept) has a CV estimate of the MSE within 1 standard error of the minimum predicted MSE. This is a telltale sign that there are probably no useful predictors sitting in this data set!

Code from the interactive app

Here’s how those 3D plots were generated. Note that it’s a bit hacky, as I’m simply using points rather than a surface. I’ve spent a significant amount of time investigating other surface plots, and then deciding it was all too finicky to pull off and reverting back to this setup!

First I’ll setup the data

library(plotly)
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Correlated-predictor simulation for the 3D RSS surface plots:
# x2 is x1 plus noise, and only x1 enters the true model, so the RSS
# surface has a long shallow valley along the b1 + b2 direction.
set.seed(35521)
x1 <- rnorm(30,0,3)
x2 <- x1+rnorm(30,0,1)
y <- 20 + 3*x1 + rnorm(length(x1), sd=1)
x <- cbind(x1, x2)
# Pairwise scatterplots show the strong x1-x2 collinearity
pairs(cbind(y,x))

now a grid for plotting…

# Grid of candidate slope pairs (b1, b2) over [-4, 4] x [-4, 4]
b1b2 <- expand.grid(seq(-4, 4, by=0.1), seq(-4, 4, by=0.1))
# Append each point's L1 norm (lasso constraint region) and squared
# L2 norm (ridge constraint region)
b1b2 <- cbind(b1b2, rowSums(abs(b1b2)), rowSums(b1b2^2))
# For fixed slopes, the RSS-minimizing intercept is
# mean(y) - b1*mean(x1) - b2*mean(x2). Computed vectorized over the
# whole grid rather than with a row-wise apply() — same values, one
# pass instead of a per-row closure call.
inters <- mean(y) - b1b2[,1]*mean(x1) - b1b2[,2]*mean(x2)
params <- cbind(inters, b1b2)
# RSS at each grid point: v = (intercept, b1, b2, ...)
rssgrid <- apply(params, 1, function(v) sum((y-(v[1]+v[2]*x1+v[3]*x2))^2))
plotit <- data.frame(b1=b1b2[,1], b2=b1b2[,2], rss=rssgrid)
###essentially unconstrained (penalty weak enough that global minimum exists)
library(plotly)
# Camera preset for the 3D plots
# NOTE(review): `scene` is defined but never passed to plot_ly/layout
# in the chunks below — confirm whether it was meant to be used.
scene <- list(camera = list(eye = list(x = -1.25, y = 1.25, z = 0)))
# Subset the grid to the ridge (L2^2 <= 40) and lasso (L1 <= 10)
# feasible regions
rr <- plotit[b1b2[,4]<=40,]
la <- plotit[b1b2[,3]<=10,]

Here’s the (essentially) unconstrained ridge regression

# 3D scatter of RSS over the ridge feasible region; marker color also
# encodes RSS
plot_ly(rr, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
  add_markers()

and the unconstrained LASSO…

# Same surface over the lasso (L1) feasible region
plot_ly(la, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
  add_markers()

now a bit more constraining…RR

#slight constraint
# Tighten both constraint regions and re-subset the RSS grid
rr <- plotit[b1b2[,4]<=30,]
la <- plotit[b1b2[,3]<=7,]
plot_ly(rr, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
  add_markers()

and LASSO

plot_ly(la, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
  add_markers()

…more constraining..RR

# Tighter constraints: ridge L2^2 <= 20, lasso L1 <= 5
rr <- plotit[b1b2[,4]<=20,]
la <- plotit[b1b2[,3]<=5,]
plot_ly(rr, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
  add_markers()

…and LASSO

plot_ly(la, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
  add_markers()

heavier constraints..RR

# Heavier still: ridge L2^2 <= 10, lasso L1 <= 2
rr <- plotit[b1b2[,4]<=10,]
la <- plotit[b1b2[,3]<=2,]
plot_ly(rr, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
  add_markers()

LASSO

plot_ly(la, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
  add_markers()

and more…RR

# Very tight constraints: both regions shrunk to <= 1
rr <- plotit[b1b2[,4]<=1,]
la <- plotit[b1b2[,3]<=1,]
plot_ly(rr, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
  add_markers()
plot_ly(la, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
  add_markers()

and finally more again…RR

# Tightest constraints (<= 0.5): here the lasso region's corners sit
# on the axes, illustrating why lasso zeroes out coefficients
rr <- plotit[b1b2[,4]<=0.5,]
la <- plotit[b1b2[,3]<=0.5,]
plot_ly(rr, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
  add_markers()

and LASSO.

plot_ly(la, x=~b1, y=~b2, z=~rss, marker = list(color = ~rss, colorscale = c('#FFE1A1', '#683531'), showscale = TRUE)) %>%
  add_markers()